In [2]:
import numpy as np
import pandas as pd
import os
In [3]:
os.chdir("C:\\Users\Morvi panchal\OneDrive\Desktop\Practice data science")
In [4]:
df = pd.read_csv("parkinsons.csv")
df.head()
Out[4]:
name MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer ... Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
0 phon_R01_S01_1 119.992 157.302 74.997 0.00784 0.00007 0.00370 0.00554 0.01109 0.04374 ... 0.06545 0.02211 21.033 1 0.414783 0.815285 -4.813031 0.266482 2.301442 0.284654
1 phon_R01_S01_2 122.400 148.650 113.819 0.00968 0.00008 0.00465 0.00696 0.01394 0.06134 ... 0.09403 0.01929 19.085 1 0.458359 0.819521 -4.075192 0.335590 2.486855 0.368674
2 phon_R01_S01_3 116.682 131.111 111.555 0.01050 0.00009 0.00544 0.00781 0.01633 0.05233 ... 0.08270 0.01309 20.651 1 0.429895 0.825288 -4.443179 0.311173 2.342259 0.332634
3 phon_R01_S01_4 116.676 137.871 111.366 0.00997 0.00009 0.00502 0.00698 0.01505 0.05492 ... 0.08771 0.01353 20.644 1 0.434969 0.819235 -4.117501 0.334147 2.405554 0.368975
4 phon_R01_S01_5 116.014 141.781 110.655 0.01284 0.00011 0.00655 0.00908 0.01966 0.06425 ... 0.10470 0.01767 19.649 1 0.417356 0.823484 -3.747787 0.234513 2.332180 0.410335

5 rows × 24 columns

In [5]:
!pip install pandas-profiling
Requirement already satisfied: pandas-profiling in m:\anaconda\lib\site-packages (3.6.6)
Requirement already satisfied: ydata-profiling in m:\anaconda\lib\site-packages (from pandas-profiling) (4.5.1)
Requirement already satisfied: PyYAML<6.1,>=5.0.0 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (5.3.1)
Requirement already satisfied: jinja2<3.2,>=2.11.1 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (2.11.2)
Requirement already satisfied: numpy<1.24,>=1.16.0 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (1.19.2)
Requirement already satisfied: scipy<1.12,>=1.4.1 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (1.5.2)
Requirement already satisfied: seaborn<0.13,>=0.10.1 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (0.11.0)
Requirement already satisfied: htmlmin==0.1.12 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (0.1.12)
Requirement already satisfied: visions[type_image_path]==0.7.5 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (0.7.5)
Requirement already satisfied: statsmodels<1,>=0.13.2 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (0.14.0)
Requirement already satisfied: dacite>=1.8 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (1.8.1)
Requirement already satisfied: typeguard<3,>=2.13.2 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (2.13.3)
Requirement already satisfied: multimethod<2,>=1.4 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (1.9.1)
Requirement already satisfied: imagehash==4.3.1 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (4.3.1)
Requirement already satisfied: requests<3,>=2.24.0 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (2.24.0)
Requirement already satisfied: tqdm<5,>=4.48.2 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (4.50.2)
Requirement already satisfied: wordcloud>=1.9.1 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (1.9.2)
Requirement already satisfied: matplotlib<4,>=3.2 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (3.3.2)
Requirement already satisfied: pydantic<2,>=1.8.1 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (1.10.12)
Requirement already satisfied: pandas!=1.4.0,<2.1,>1.1 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (1.1.3)
Requirement already satisfied: phik<0.13,>=0.11.1 in m:\anaconda\lib\site-packages (from ydata-profiling->pandas-profiling) (0.12.3)
Requirement already satisfied: MarkupSafe>=0.23 in m:\anaconda\lib\site-packages (from jinja2<3.2,>=2.11.1->ydata-profiling->pandas-profiling) (1.1.1)
Requirement already satisfied: networkx>=2.4 in m:\anaconda\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (2.5)
Requirement already satisfied: tangled-up-in-unicode>=0.0.4 in m:\anaconda\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (0.2.0)
Requirement already satisfied: attrs>=19.3.0 in m:\anaconda\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (20.3.0)
Requirement already satisfied: Pillow; extra == "type_image_path" in m:\anaconda\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (8.0.1)
Requirement already satisfied: packaging>=21.3 in m:\anaconda\lib\site-packages (from statsmodels<1,>=0.13.2->ydata-profiling->pandas-profiling) (23.1)
Requirement already satisfied: patsy>=0.5.2 in m:\anaconda\lib\site-packages (from statsmodels<1,>=0.13.2->ydata-profiling->pandas-profiling) (0.5.3)
Requirement already satisfied: PyWavelets in m:\anaconda\lib\site-packages (from imagehash==4.3.1->ydata-profiling->pandas-profiling) (1.1.1)
Requirement already satisfied: certifi>=2017.4.17 in m:\anaconda\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling->pandas-profiling) (2020.6.20)
Requirement already satisfied: chardet<4,>=3.0.2 in m:\anaconda\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling->pandas-profiling) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in m:\anaconda\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling->pandas-profiling) (2.10)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in m:\anaconda\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling->pandas-profiling) (1.25.11)
Requirement already satisfied: kiwisolver>=1.0.1 in m:\anaconda\lib\site-packages (from matplotlib<4,>=3.2->ydata-profiling->pandas-profiling) (1.3.0)
Requirement already satisfied: python-dateutil>=2.1 in m:\anaconda\lib\site-packages (from matplotlib<4,>=3.2->ydata-profiling->pandas-profiling) (2.8.1)
Requirement already satisfied: cycler>=0.10 in m:\anaconda\lib\site-packages (from matplotlib<4,>=3.2->ydata-profiling->pandas-profiling) (0.10.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in m:\anaconda\lib\site-packages (from matplotlib<4,>=3.2->ydata-profiling->pandas-profiling) (2.4.7)
Requirement already satisfied: typing-extensions>=4.2.0 in m:\anaconda\lib\site-packages (from pydantic<2,>=1.8.1->ydata-profiling->pandas-profiling) (4.8.0)
Requirement already satisfied: pytz>=2017.2 in m:\anaconda\lib\site-packages (from pandas!=1.4.0,<2.1,>1.1->ydata-profiling->pandas-profiling) (2020.1)
Requirement already satisfied: joblib>=0.14.1 in m:\anaconda\lib\site-packages (from phik<0.13,>=0.11.1->ydata-profiling->pandas-profiling) (0.17.0)
Requirement already satisfied: decorator>=4.3.0 in m:\anaconda\lib\site-packages (from networkx>=2.4->visions[type_image_path]==0.7.5->ydata-profiling->pandas-profiling) (4.4.2)
Requirement already satisfied: six in m:\anaconda\lib\site-packages (from patsy>=0.5.2->statsmodels<1,>=0.13.2->ydata-profiling->pandas-profiling) (1.15.0)
In [6]:
import pandas_profiling as pf
display(pf.ProfileReport(df))
<ipython-input-6-7c263133eb4e>:1: DeprecationWarning: `import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead.
  import pandas_profiling as pf




In [7]:
display(df.shape)
(195, 24)
In [8]:
display(len(df))
195
In [9]:
print(df.info)
<bound method DataFrame.info of                name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0    phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1    phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2    phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3    phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4    phon_R01_S01_5      116.014       141.781       110.655         0.01284   
..              ...          ...           ...           ...             ...   
190  phon_R01_S50_2      174.188       230.978        94.261         0.00459   
191  phon_R01_S50_3      209.516       253.017        89.488         0.00564   
192  phon_R01_S50_4      174.688       240.005        74.287         0.01360   
193  phon_R01_S50_5      198.764       396.961        74.904         0.00740   
194  phon_R01_S50_6      214.289       260.277        77.973         0.00567   

     MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0             0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1             0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2             0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3             0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4             0.00011   0.00655   0.00908     0.01966       0.06425  ...   
..                ...       ...       ...         ...           ...  ...   
190           0.00003   0.00263   0.00259     0.00790       0.04087  ...   
191           0.00003   0.00331   0.00292     0.00994       0.02751  ...   
192           0.00008   0.00624   0.00564     0.01873       0.02308  ...   
193           0.00004   0.00370   0.00390     0.01109       0.02296  ...   
194           0.00003   0.00295   0.00317     0.00885       0.01884  ...   

     Shimmer:DDA      NHR     HNR  status      RPDE       DFA   spread1  \
0        0.06545  0.02211  21.033       1  0.414783  0.815285 -4.813031   
1        0.09403  0.01929  19.085       1  0.458359  0.819521 -4.075192   
2        0.08270  0.01309  20.651       1  0.429895  0.825288 -4.443179   
3        0.08771  0.01353  20.644       1  0.434969  0.819235 -4.117501   
4        0.10470  0.01767  19.649       1  0.417356  0.823484 -3.747787   
..           ...      ...     ...     ...       ...       ...       ...   
190      0.07008  0.02764  19.517       0  0.448439  0.657899 -6.538586   
191      0.04812  0.01810  19.147       0  0.431674  0.683244 -6.195325   
192      0.03804  0.10715  17.883       0  0.407567  0.655683 -6.787197   
193      0.03794  0.07223  19.020       0  0.451221  0.643956 -6.744577   
194      0.03078  0.04398  21.209       0  0.462803  0.664357 -5.724056   

      spread2        D2       PPE  
0    0.266482  2.301442  0.284654  
1    0.335590  2.486855  0.368674  
2    0.311173  2.342259  0.332634  
3    0.334147  2.405554  0.368975  
4    0.234513  2.332180  0.410335  
..        ...       ...       ...  
190  0.121952  2.657476  0.133050  
191  0.129303  2.784312  0.168895  
192  0.158453  2.679772  0.131728  
193  0.207454  2.138608  0.123306  
194  0.190667  2.555477  0.148569  

[195 rows x 24 columns]>
In [10]:
display(df.isna().sum())
name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64
In [11]:
print(df.status)
0      1
1      1
2      1
3      1
4      1
      ..
190    0
191    0
192    0
193    0
194    0
Name: status, Length: 195, dtype: int64
In [12]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,6))
df.status.hist()
plt.xlabel('status')
plt.ylabel('Frequencies')
plt.plot()
plt.show()
In [13]:
import seaborn as sns
In [14]:
plt.figure(figsize=(10,6))
sns.barplot(x='status',y="HNR",data=df)
plt.show()
In [15]:
plt.figure(figsize=(10,6))
sns.barplot(x='status',y='RPDE',data=df);
plt.show()
In [16]:
import warnings
warnings.filterwarnings('ignore')
rows=3
cols=7
fig, ax=plt.subplots(nrows=rows,ncols=cols,figsize=(16,4))
col=df.columns
index=1
for i in range(rows):
    for j in range(cols):
        sns.distplot(df[col[index]],ax=ax[i][j])
        index=index+1
        
plt.tight_layout()
plt.show()
In [17]:
corr = df.corr()
display(corr)
MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer MDVP:Shimmer(dB) ... Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
MDVP:Fo(Hz) 1.000000 0.400985 0.596546 -0.118003 -0.382027 -0.076194 -0.112165 -0.076213 -0.098374 -0.073742 ... -0.094732 -0.021981 0.059144 -0.383535 -0.383894 -0.446013 -0.413738 -0.249450 0.177980 -0.372356
MDVP:Fhi(Hz) 0.400985 1.000000 0.084951 0.102086 -0.029198 0.097177 0.091126 0.097150 0.002281 0.043465 ... -0.003733 0.163766 -0.024893 -0.166136 -0.112404 -0.343097 -0.076658 -0.002954 0.176323 -0.069543
MDVP:Flo(Hz) 0.596546 0.084951 1.000000 -0.139919 -0.277815 -0.100519 -0.095828 -0.100488 -0.144543 -0.119089 ... -0.150737 -0.108670 0.210851 -0.380200 -0.400143 -0.050406 -0.394857 -0.243829 -0.100629 -0.340071
MDVP:Jitter(%) -0.118003 0.102086 -0.139919 1.000000 0.935714 0.990276 0.974256 0.990276 0.769063 0.804289 ... 0.746635 0.906959 -0.728165 0.278220 0.360673 0.098572 0.693577 0.385123 0.433434 0.721543
MDVP:Jitter(Abs) -0.382027 -0.029198 -0.277815 0.935714 1.000000 0.922911 0.897778 0.922913 0.703322 0.716601 ... 0.697170 0.834972 -0.656810 0.338653 0.441839 0.175036 0.735779 0.388543 0.310694 0.748162
MDVP:RAP -0.076194 0.097177 -0.100519 0.990276 0.922911 1.000000 0.957317 1.000000 0.759581 0.790652 ... 0.744919 0.919521 -0.721543 0.266668 0.342140 0.064083 0.648328 0.324407 0.426605 0.670999
MDVP:PPQ -0.112165 0.091126 -0.095828 0.974256 0.897778 0.957317 1.000000 0.957319 0.797826 0.839239 ... 0.763592 0.844604 -0.731510 0.288698 0.333274 0.196301 0.716489 0.407605 0.412524 0.769647
Jitter:DDP -0.076213 0.097150 -0.100488 0.990276 0.922913 1.000000 0.957319 1.000000 0.759555 0.790621 ... 0.744901 0.919548 -0.721494 0.266646 0.342079 0.064026 0.648328 0.324377 0.426556 0.671005
MDVP:Shimmer -0.098374 0.002281 -0.144543 0.769063 0.703322 0.759581 0.797826 0.759555 1.000000 0.987258 ... 0.987626 0.722194 -0.835271 0.367430 0.447424 0.159954 0.654734 0.452025 0.507088 0.693771
MDVP:Shimmer(dB) -0.073742 0.043465 -0.119089 0.804289 0.716601 0.790652 0.839239 0.790621 0.987258 1.000000 ... 0.963202 0.744477 -0.827805 0.350697 0.410684 0.165157 0.652547 0.454314 0.512233 0.695058
Shimmer:APQ3 -0.094717 -0.003743 -0.150747 0.746625 0.697153 0.744912 0.763580 0.744894 0.987625 0.963198 ... 1.000000 0.716207 -0.827123 0.347617 0.435242 0.151124 0.610967 0.402243 0.467265 0.645377
Shimmer:APQ5 -0.070682 -0.009997 -0.101095 0.725561 0.648961 0.709927 0.786780 0.709907 0.982835 0.973751 ... 0.960072 0.658080 -0.813753 0.351148 0.399903 0.213873 0.646809 0.457195 0.502174 0.702456
MDVP:APQ -0.077774 0.004937 -0.107293 0.758255 0.648793 0.737455 0.804139 0.737439 0.950083 0.960977 ... 0.896647 0.694019 -0.800407 0.364316 0.451379 0.157276 0.673158 0.502188 0.536869 0.721694
Shimmer:DDA -0.094732 -0.003733 -0.150737 0.746635 0.697170 0.744919 0.763592 0.744901 0.987626 0.963202 ... 1.000000 0.716215 -0.827130 0.347608 0.435237 0.151132 0.610971 0.402223 0.467261 0.645389
NHR -0.021981 0.163766 -0.108670 0.906959 0.834972 0.919521 0.844604 0.919548 0.722194 0.744477 ... 0.716215 1.000000 -0.714072 0.189429 0.370890 -0.131882 0.540865 0.318099 0.470949 0.552591
HNR 0.059144 -0.024893 0.210851 -0.728165 -0.656810 -0.721543 -0.731510 -0.721494 -0.835271 -0.827805 ... -0.827130 -0.714072 1.000000 -0.361515 -0.598736 -0.008665 -0.673210 -0.431564 -0.601401 -0.692876
status -0.383535 -0.166136 -0.380200 0.278220 0.338653 0.266668 0.288698 0.266646 0.367430 0.350697 ... 0.347608 0.189429 -0.361515 1.000000 0.308567 0.231739 0.564838 0.454842 0.340232 0.531039
RPDE -0.383894 -0.112404 -0.400143 0.360673 0.441839 0.342140 0.333274 0.342079 0.447424 0.410684 ... 0.435237 0.370890 -0.598736 0.308567 1.000000 -0.110950 0.591117 0.479905 0.236931 0.545886
DFA -0.446013 -0.343097 -0.050406 0.098572 0.175036 0.064083 0.196301 0.064026 0.159954 0.165157 ... 0.151132 -0.131882 -0.008665 0.231739 -0.110950 1.000000 0.195668 0.166548 -0.165381 0.270445
spread1 -0.413738 -0.076658 -0.394857 0.693577 0.735779 0.648328 0.716489 0.648328 0.654734 0.652547 ... 0.610971 0.540865 -0.673210 0.564838 0.591117 0.195668 1.000000 0.652358 0.495123 0.962435
spread2 -0.249450 -0.002954 -0.243829 0.385123 0.388543 0.324407 0.407605 0.324377 0.452025 0.454314 ... 0.402223 0.318099 -0.431564 0.454842 0.479905 0.166548 0.652358 1.000000 0.523532 0.644711
D2 0.177980 0.176323 -0.100629 0.433434 0.310694 0.426605 0.412524 0.426556 0.507088 0.512233 ... 0.467261 0.470949 -0.601401 0.340232 0.236931 -0.165381 0.495123 0.523532 1.000000 0.480585
PPE -0.372356 -0.069543 -0.340071 0.721543 0.748162 0.670999 0.769647 0.671005 0.693771 0.695058 ... 0.645389 0.552591 -0.692876 0.531039 0.545886 0.270445 0.962435 0.644711 0.480585 1.000000

23 rows × 23 columns

In [18]:
df.drop(['name'],axis=1,inplace=True)
display (df)
MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer MDVP:Shimmer(dB) ... Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
0 119.992 157.302 74.997 0.00784 0.00007 0.00370 0.00554 0.01109 0.04374 0.426 ... 0.06545 0.02211 21.033 1 0.414783 0.815285 -4.813031 0.266482 2.301442 0.284654
1 122.400 148.650 113.819 0.00968 0.00008 0.00465 0.00696 0.01394 0.06134 0.626 ... 0.09403 0.01929 19.085 1 0.458359 0.819521 -4.075192 0.335590 2.486855 0.368674
2 116.682 131.111 111.555 0.01050 0.00009 0.00544 0.00781 0.01633 0.05233 0.482 ... 0.08270 0.01309 20.651 1 0.429895 0.825288 -4.443179 0.311173 2.342259 0.332634
3 116.676 137.871 111.366 0.00997 0.00009 0.00502 0.00698 0.01505 0.05492 0.517 ... 0.08771 0.01353 20.644 1 0.434969 0.819235 -4.117501 0.334147 2.405554 0.368975
4 116.014 141.781 110.655 0.01284 0.00011 0.00655 0.00908 0.01966 0.06425 0.584 ... 0.10470 0.01767 19.649 1 0.417356 0.823484 -3.747787 0.234513 2.332180 0.410335
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
190 174.188 230.978 94.261 0.00459 0.00003 0.00263 0.00259 0.00790 0.04087 0.405 ... 0.07008 0.02764 19.517 0 0.448439 0.657899 -6.538586 0.121952 2.657476 0.133050
191 209.516 253.017 89.488 0.00564 0.00003 0.00331 0.00292 0.00994 0.02751 0.263 ... 0.04812 0.01810 19.147 0 0.431674 0.683244 -6.195325 0.129303 2.784312 0.168895
192 174.688 240.005 74.287 0.01360 0.00008 0.00624 0.00564 0.01873 0.02308 0.256 ... 0.03804 0.10715 17.883 0 0.407567 0.655683 -6.787197 0.158453 2.679772 0.131728
193 198.764 396.961 74.904 0.00740 0.00004 0.00370 0.00390 0.01109 0.02296 0.241 ... 0.03794 0.07223 19.020 0 0.451221 0.643956 -6.744577 0.207454 2.138608 0.123306
194 214.289 260.277 77.973 0.00567 0.00003 0.00295 0.00317 0.00885 0.01884 0.190 ... 0.03078 0.04398 21.209 0 0.462803 0.664357 -5.724056 0.190667 2.555477 0.148569

195 rows × 23 columns

In [19]:
x = df.drop('status',axis=1)
display(x.head())
MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer MDVP:Shimmer(dB) ... MDVP:APQ Shimmer:DDA NHR HNR RPDE DFA spread1 spread2 D2 PPE
0 119.992 157.302 74.997 0.00784 0.00007 0.00370 0.00554 0.01109 0.04374 0.426 ... 0.02971 0.06545 0.02211 21.033 0.414783 0.815285 -4.813031 0.266482 2.301442 0.284654
1 122.400 148.650 113.819 0.00968 0.00008 0.00465 0.00696 0.01394 0.06134 0.626 ... 0.04368 0.09403 0.01929 19.085 0.458359 0.819521 -4.075192 0.335590 2.486855 0.368674
2 116.682 131.111 111.555 0.01050 0.00009 0.00544 0.00781 0.01633 0.05233 0.482 ... 0.03590 0.08270 0.01309 20.651 0.429895 0.825288 -4.443179 0.311173 2.342259 0.332634
3 116.676 137.871 111.366 0.00997 0.00009 0.00502 0.00698 0.01505 0.05492 0.517 ... 0.03772 0.08771 0.01353 20.644 0.434969 0.819235 -4.117501 0.334147 2.405554 0.368975
4 116.014 141.781 110.655 0.01284 0.00011 0.00655 0.00908 0.01966 0.06425 0.584 ... 0.04465 0.10470 0.01767 19.649 0.417356 0.823484 -3.747787 0.234513 2.332180 0.410335

5 rows × 22 columns

In [20]:
y = df.status
display(y.head())
0    1
1    1
2    1
3    1
4    1
Name: status, dtype: int64
In [21]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=40)
In [23]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
lg= LogisticRegression().fit(x_train, y_train)
train_preds = lg.predict(x_train)
print("Model accuracy on train is: ", accuracy_score(y_train, train_preds))
test_preds = lg.predict(x_test)
print("Model accuracy on test is: ", accuracy_score(y_test, test_preds))
print('-'*50)
Model accuracy on train is:  0.8782051282051282
Model accuracy on test is:  0.8461538461538461
--------------------------------------------------
In [ ]:
print("confusion_matrix train is:\n ", confusion_matrix(y_train, train_preds))
print("confusion_matrix test is:\n ", confusion_matrix(y_test, test_preds))
print('\nClassification Report Train is ')
print(classification_report (y_train, train_preds))
print('\nClassification Report Test is ')
print(classification_report (y_test, test_preds))
In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
rf=RandomForestClassifier().fit(x_train,y_train)
trainpreds2 = rf.predict(x_train)
print("Model accuracy on train is: ", accuracy_score(y_train, trainpreds2))
testpreds2 = rf.predict(x_test)
print("Model accuracy on test is: ", accuracy_score(y_test, testpreds2))
Model accuracy on train is:  1.0
Model accuracy on test is:  0.9230769230769231
In [25]:
print("Confusion matrix of train is: ",confusion_matrix(y_train,trainpreds2))
print("Confusion matrix of test is: ",confusion_matrix(y_test,testpreds2))
print("Classification matrix of train is: ",classification_report(y_train,trainpreds2))
print("Classification matrix of test is: ",classification_report(y_test,testpreds2))
Confusion matrix of train is:  [[ 40   0]
 [  0 116]]
Confusion matrix of test is:  [[ 6  2]
 [ 1 30]]
Classification matrix of train is:                precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00       116

    accuracy                           1.00       156
   macro avg       1.00      1.00      1.00       156
weighted avg       1.00      1.00      1.00       156

Classification matrix of test is:                precision    recall  f1-score   support

           0       0.86      0.75      0.80         8
           1       0.94      0.97      0.95        31

    accuracy                           0.92        39
   macro avg       0.90      0.86      0.88        39
weighted avg       0.92      0.92      0.92        39

In [26]:
print((y_test !=testpreds2).sum(),'/',((y_test == testpreds2).sum()+(y_test != testpreds2).sum()))
3 / 39
In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
import os
In [28]:
print('The kappa sore is: ',metrics.cohen_kappa_score(y_test,testpreds2))
The kappa sore is:  0.7526427061310782
In [29]:
ddf = pd.DataFrame(data=[testpreds2,y_test])
display(ddf)
0 1 2 3 4 5 6 7 8 9 ... 29 30 31 32 33 34 35 36 37 38
0 1 1 1 0 1 1 1 1 1 1 ... 1 0 0 0 1 1 1 1 1 1
1 1 1 1 0 1 0 1 1 1 0 ... 1 0 0 0 1 1 1 1 1 1

2 rows × 39 columns

In [30]:
display(ddf.T)
0 1
0 1 1
1 1 1
2 1 1
3 0 0
4 1 1
5 1 0
6 1 1
7 1 1
8 1 1
9 1 0
10 1 1
11 1 1
12 0 0
13 1 1
14 1 1
15 1 1
16 1 1
17 1 1
18 1 1
19 1 1
20 1 1
21 0 1
22 1 1
23 1 1
24 0 0
25 1 1
26 1 1
27 1 1
28 1 1
29 1 1
30 0 0
31 0 0
32 0 0
33 1 1
34 1 1
35 1 1
36 1 1
37 1 1
38 1 1
In [31]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier().fit(x,y)
trainpred3 = dt.predict(x_train)
print("The accuracy of decision tree(train) is: ",accuracy_score(y_train,trainpred3))
testpreds3 = dt.predict(x_test)
print("The accuracy of decision trr(test) is: ",accuracy_score(y_test,testpreds3))
The accuracy of decision tree(train) is:  1.0
The accuracy of decision trr(test) is:  1.0
In [32]:
print("The train confusion matric is: ",confusion_matrix(y_train,trainpred3))
print("The test confusion matric is: ",confusion_matrix(y_test,testpreds3))
The train confusion matric is:  [[ 40   0]
 [  0 116]]
The test confusion matric is:  [[ 8  0]
 [ 0 31]]
In [33]:
print("The classification report is for train: ",classification_report(y_train,trainpred3))
print("The classification report is for test: ",classification_report(y_test,testpreds3))
The classification report is for train:                precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00       116

    accuracy                           1.00       156
   macro avg       1.00      1.00      1.00       156
weighted avg       1.00      1.00      1.00       156

The classification report is for test:                precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00        31

    accuracy                           1.00        39
   macro avg       1.00      1.00      1.00        39
weighted avg       1.00      1.00      1.00        39

In [34]:
print((y_test !=testpreds3).sum(),'/',((y_test == testpreds3).sum()+(y_test != testpreds3).sum()))
0 / 39
In [35]:
print("Kappa score is: ",metrics.cohen_kappa_score(y_test,testpreds3))
Kappa score is:  1.0
In [36]:
from sklearn.naive_bayes import GaussianNB
#fit the model on train data 
nb=GaussianNB()
nb.fit(x_train,y_train) 
trainpreds4 = nb.predict(x_train)
print("Model accuracy on train is: ", accuracy_score(y_train, trainpreds4))
testpreds4 = nb.predict(x_test)
print("Model accuracy on test is: ", accuracy_score(y_test, testpreds4))
Model accuracy on train is:  0.7307692307692307
Model accuracy on test is:  0.6923076923076923
In [37]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
print("Classification matrix of train is : ",confusion_matrix(y_train,trainpreds4))
print("Classification matrix of test is: ",confusion_matrix(y_test,testpreds4))
Classification matrix of train is :  [[38  2]
 [40 76]]
Classification matrix of test is:  [[ 8  0]
 [12 19]]
In [38]:
print("Classification matrix of train is: ",classification_report(y_train,trainpreds4))
print("Classification matrix of test is: ",classification_report(y_test,testpreds4))
Classification matrix of train is:                precision    recall  f1-score   support

           0       0.49      0.95      0.64        40
           1       0.97      0.66      0.78       116

    accuracy                           0.73       156
   macro avg       0.73      0.80      0.71       156
weighted avg       0.85      0.73      0.75       156

Classification matrix of test is:                precision    recall  f1-score   support

           0       0.40      1.00      0.57         8
           1       1.00      0.61      0.76        31

    accuracy                           0.69        39
   macro avg       0.70      0.81      0.67        39
weighted avg       0.88      0.69      0.72        39

In [39]:
print((y_test!=testpreds4).sum(),'/',((y_test==testpreds4).sum()+(y_test!=testpreds4).sum()))
12 / 39
In [40]:
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier().fit(x_train,y_train)
trainpreds5 = kn.predict(x_train)
print("The accuracy train is: ",accuracy_score(y_train,trainpreds5))
testpreds5 = kn.predict(x_test)
print("The accuracy test is: ",accuracy_score(y_test,testpreds5))
The accuracy train is:  0.9102564102564102
The accuracy test is:  0.8461538461538461
In [41]:
print("Confusion matrix of train is: ",confusion_matrix(y_train,trainpreds5))
print("Confusion matrix of test is: ",confusion_matrix(y_test,testpreds5))
Confusion matrix of train is:  [[ 30  10]
 [  4 112]]
Confusion matrix of test is:  [[ 4  4]
 [ 2 29]]
In [42]:
print("Classification matrix of train is: ",classification_report(y_train,trainpreds5))
print("confusion matrix of test is: ",classification_report(y_test,testpreds5))
Classification matrix of train is:                precision    recall  f1-score   support

           0       0.88      0.75      0.81        40
           1       0.92      0.97      0.94       116

    accuracy                           0.91       156
   macro avg       0.90      0.86      0.88       156
weighted avg       0.91      0.91      0.91       156

confusion matrix of test is:                precision    recall  f1-score   support

           0       0.67      0.50      0.57         8
           1       0.88      0.94      0.91        31

    accuracy                           0.85        39
   macro avg       0.77      0.72      0.74        39
weighted avg       0.84      0.85      0.84        39

In [45]:
print('KappaScore is: ', metrics.cohen_kappa_score(y_test,test_preds5))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-45-cf615e1a1029> in <module>
----> 1 print('KappaScore is: ', metrics.cohen_kappa_score(y_test,test_preds5))

NameError: name 'test_preds5' is not defined
In [46]:
print('KappaScore is: ', metrics.cohen_kappa_score(y_test,testpreds5))
KappaScore is:  0.48
In [47]:
from sklearn.svm import SVC
sv = SVC()
sv.fit(x_train,y_train)
trainpreds6 =sv.predict(x_train)
print("Accuracy of train is: ",accuracy_score(y_train,trainpreds6))
testpreds6 = sv.predict(x_test)
print("Accuracy of test is:  ",accuracy_score(y_test,testpreds6))
Accuracy of train is:  0.8141025641025641
Accuracy of test is:   0.8205128205128205
In [48]:
print("Confusion matrix of train is: ",confusion_matrix(y_train,trainpreds6))
print("Confusion matrix of test is: ",confusion_matrix(y_test,testpreds6))
Confusion matrix of train is:  [[ 11  29]
 [  0 116]]
Confusion matrix of test is:  [[ 2  6]
 [ 1 30]]
In [49]:
print("Classification report of train is:  ",classification_report(y_train,trainpreds6))
print("Classification report of train is:  ",classification_report(y_test,testpreds6))
Classification report of train is:                 precision    recall  f1-score   support

           0       1.00      0.28      0.43        40
           1       0.80      1.00      0.89       116

    accuracy                           0.81       156
   macro avg       0.90      0.64      0.66       156
weighted avg       0.85      0.81      0.77       156

Classification report of train is:                 precision    recall  f1-score   support

           0       0.67      0.25      0.36         8
           1       0.83      0.97      0.90        31

    accuracy                           0.82        39
   macro avg       0.75      0.61      0.63        39
weighted avg       0.80      0.82      0.79        39

In [50]:
print('KappaScore is: ', metrics.cohen_kappa_score(y_test,testpreds6))
KappaScore is:  0.2834645669291339
In [51]:
print((y_test !=testpreds6).sum(),'/',((y_test == testpreds6).sum()+(y_test != testpreds6).sum()))
7 / 39
In [52]:
sv.predict([[116.682,131.111,111.555,0.0105,0.00009,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,0.03858,0.0359,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634

]])
Out[52]:
array([1], dtype=int64)
In [53]:
import pickle
pickle.dump(dt,open('DecisionTreeParkinsons.pkl','wb'))
model = pickle.load(open('DecisionTreeParkinsons.pkl','rb'))
In [ ]: